# Load the UCI bank-marketing data set (semicolon-separated CSV)
data_file_location <- "/Users/ravisivaraman/Downloads/bank-additional/bank-additional-full.csv"
# NOTE(review): the original used read.csv2(), which assumes "," as the
# decimal mark. This file uses "." decimals, so emp.var.rate, cons.price.idx,
# cons.conf.idx, euribor3m and nr.employed were parsed as character (see the
# recorded summary below). read.csv() parses them as numeric and matches the
# read.csv() reload later in this script.
bank_Data <- read.csv(data_file_location, header = TRUE, sep = ";")
# Convert the response variable to a factor
bank_Data$y <- as.factor(bank_Data$y)
# Check summary of the original data
# (output below was recorded from the original read.csv2() run, which is why
# the five economic-indicator columns show as character there)
summary(bank_Data)
## age job marital education
## Min. :17.00 Length:41188 Length:41188 Length:41188
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.02
## 3rd Qu.:47.00
## Max. :98.00
## default housing loan contact
## Length:41188 Length:41188 Length:41188 Length:41188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:41188 Length:41188 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 102.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 180.0 Median : 2.000
## Mean : 258.3 Mean : 2.568
## 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :4918.0 Max. :56.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.000 Length:41188 Length:41188
## 1st Qu.:999.0 1st Qu.:0.000 Class :character Class :character
## Median :999.0 Median :0.000 Mode :character Mode :character
## Mean :962.5 Mean :0.173
## 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :999.0 Max. :7.000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Length:41188 Length:41188 Length:41188 Length:41188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## y
## no :36548
## yes: 4640
##
##
##
##
# The categorical columns use the sentinel string "unknown" for missing data;
# recode those cells as genuine NA values, column by column.
bank_Data[] <- lapply(bank_Data, function(col) replace(col, col == "unknown", NA))
# Count how many cells were recoded
sum(is.na(bank_Data))
## [1] 12718
# Drop every row that contains at least one NA
bank.nona <- na.omit(bank_Data)
# Verify the cleaned frame has no missing values left
sum(is.na(bank.nona))
## [1] 0
# The response is heavily imbalanced ("yes" is small compared with "no"), so
# down-sample the majority class to make the classes balanced.
set.seed(12345)
# caret::downSample drops random majority-class rows until the classes are
# equal; per the summary output below it stores the balanced response in a
# column named "Class" (3859 no / 3859 yes)
bank.ds <-downSample(bank.nona, y=bank.nona$y)
summary(bank.ds)
## age job marital education
## Min. :17.00 Length:7718 Length:7718 Length:7718
## 1st Qu.:31.00 Class :character Class :character Class :character
## Median :36.00 Mode :character Mode :character Mode :character
## Mean :39.55
## 3rd Qu.:46.00
## Max. :89.00
## default housing loan contact
## Length:7718 Length:7718 Length:7718 Length:7718
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:7718 Length:7718 Min. : 2.0 Min. : 1.00
## Class :character Class :character 1st Qu.: 142.0 1st Qu.: 1.00
## Mode :character Mode :character Median : 257.0 Median : 2.00
## Mean : 375.2 Mean : 2.28
## 3rd Qu.: 499.0 3rd Qu.: 3.00
## Max. :4199.0 Max. :43.00
## pdays previous poutcome emp.var.rate
## Min. : 0 Min. :0.0000 Length:7718 Length:7718
## 1st Qu.:999 1st Qu.:0.0000 Class :character Class :character
## Median :999 Median :0.0000 Mode :character Mode :character
## Mean :881 Mean :0.3332
## 3rd Qu.:999 3rd Qu.:0.0000
## Max. :999 Max. :6.0000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Length:7718 Length:7718 Length:7718 Length:7718
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Class
## no :3859
## yes:3859
##
##
##
##
# Split the down-sampled data into categorical and continuous views.
# NOTE(review): the original indexed columns by position (e.g. c(2,3,4,6,21));
# names are used here so the splits survive any reordering of the source
# columns. Per the str() output below, the downSample response is "Class".
bank.Fact <- bank.ds[, c("job", "marital", "education", "housing", "loan",
                         "contact", "month", "day_of_week", "poutcome", "Class")]
bank.Fact1 <- bank.ds[, c("job", "marital", "education", "housing", "Class")]
bank.Fact2 <- bank.ds[, c("loan", "contact", "month", "day_of_week",
                          "poutcome", "Class")]
# Continuous variables
bank.Cont <- bank.ds[, c("age", "duration", "campaign", "pdays", "previous",
                         "emp.var.rate", "cons.price.idx", "cons.conf.idx",
                         "euribor3m", "nr.employed")]
bank.Cont1 <- bank.ds[, c("age", "duration", "campaign", "pdays", "previous",
                          "Class")] # Adding Response variable also
bank.Cont2 <- bank.ds[, c("emp.var.rate", "cons.price.idx", "cons.conf.idx",
                          "euribor3m", "nr.employed", "Class")] # Adding Response variable also
# Categorical predictors only (no response)
bank_cat1 <- bank.ds[, c("job", "marital", "education", "default", "housing",
                         "loan", "contact", "month", "day_of_week")]
str(bank.ds)
## 'data.frame': 7718 obs. of 21 variables:
## $ age : int 37 34 31 41 36 40 28 32 28 33 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 1 1 1 10 8 5 10 10 11 10 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 3 2 2 2 3 3 3 1 3 3 ...
## $ education : Factor w/ 7 levels "basic.4y","basic.6y",..: 7 4 7 6 4 7 6 6 4 6 ...
## $ default : Factor w/ 1 level "no": 1 1 1 1 1 1 1 1 1 1 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 2 2 1 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 2 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 1 1 1 1 1 2 1 2 1 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 2 4 8 9 4 7 7 2 7 4 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 1 5 4 2 1 5 5 1 4 ...
## $ duration : int 220 59 596 304 719 52 229 11 109 312 ...
## $ campaign : int 2 7 2 2 2 1 2 12 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.4 -2.9 -0.1 -3.4 1.4 -1.8 1.1 1.4 1.1 1.4 ...
## $ cons.price.idx: num 93.4 92.5 93.2 92.4 93.9 ...
## $ cons.conf.idx : num -36.1 -33.6 -42 -26.9 -42.7 -46.2 -36.4 -36.1 -36.4 -42.7 ...
## $ euribor3m : num 4.965 1.059 4.12 0.744 4.962 ...
## $ nr.employed : num 5228 5076 5196 5018 5228 ...
## $ Class : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# NOTE(review): the original line was pairs(bank1); no object named 'bank1'
# is defined anywhere in this script, so that call errors at run time. The
# continuous split bank.Cont is plotted instead — confirm this matches the
# intended 'bank1'.
pairs(bank.Cont)
# downSample() stored the balanced response as "Class"; restore a 'y' column
# because the aggregate()/plot() calls below reference y
bank.ds$y <- bank.ds$Class
# attach() exposes bank.ds columns on the search path. Generally discouraged,
# but the plot(age~y, ...) style calls below depend on it.
attach(bank.ds)
# Counts of "no"/"yes" for each distinct age
aggregate(y~age,data=bank.ds,summary)
## age y.no y.yes
## 1 17 1 0
## 2 18 1 6
## 3 19 3 10
## 4 20 5 16
## 5 21 9 23
## 6 22 13 32
## 7 23 28 45
## 8 24 42 67
## 9 25 61 74
## 10 26 80 114
## 11 27 72 96
## 12 28 82 137
## 13 29 145 173
## 14 30 185 179
## 15 31 211 196
## 16 32 188 158
## 17 33 215 187
## 18 34 181 167
## 19 35 205 146
## 20 36 191 134
## 21 37 159 117
## 22 38 122 114
## 23 39 122 96
## 24 40 113 70
## 25 41 124 89
## 26 42 120 78
## 27 43 101 73
## 28 44 88 55
## 29 45 91 61
## 30 46 93 62
## 31 47 77 46
## 32 48 77 86
## 33 49 68 37
## 34 50 59 71
## 35 51 69 56
## 36 52 55 61
## 37 53 58 53
## 38 54 63 51
## 39 55 47 42
## 40 56 48 63
## 41 57 48 45
## 42 58 37 51
## 43 59 34 49
## 44 60 18 47
## 45 61 4 28
## 46 62 2 18
## 47 63 2 12
## 48 64 1 22
## 49 65 4 17
## 50 66 6 22
## 51 67 0 8
## 52 68 1 15
## 53 69 2 14
## 54 70 3 17
## 55 71 3 18
## 56 72 4 11
## 57 73 2 11
## 58 74 1 12
## 59 75 2 8
## 60 76 2 14
## 61 77 0 8
## 62 78 3 9
## 63 79 1 7
## 64 80 1 15
## 65 81 1 5
## 66 82 2 9
## 67 83 1 8
## 68 84 0 2
## 69 85 2 2
## 70 86 0 2
## 71 87 0 1
## 72 88 0 9
## 73 89 0 2
# Box plots of continuous predictors split by the response. The bare column
# names resolve through the attach()ed bank.ds; with a factor on the RHS of
# the formula, plot() draws per-class boxplots.
plot(age~y,col=c("red","blue"))
plot(campaign~y, col=c("red","blue"))
plot(duration~y, col=c("red","blue"))
plot(emp.var.rate~y, col=c("red","blue"))
plot(cons.conf.idx~y, col=c("red","blue"))
# Side-by-side bar charts of each categorical predictor vs. the response
ggplot(bank.ds,aes(x = month, fill = y)) + geom_bar(position = "dodge")
ggplot(bank.ds,aes(x = marital, fill = y)) + geom_bar(position = "dodge")
ggplot(bank.ds,aes(x = education, fill = y)) + geom_bar(position = "dodge")
ggplot(bank.ds,aes(x = default, fill = y)) + geom_bar(position = "dodge")
ggplot(bank.ds,aes(x = poutcome, fill = y)) + geom_bar(position = "dodge")
ggplot(bank.ds,aes(x = day_of_week, fill = y)) + geom_bar(position = "dodge")
# NOTE(review): x here is the factor y, so geom_density() has no continuous
# variable to smooth — this plot likely does not show what the title claims;
# confirm whether a continuous x (e.g. age) was intended
ggplot(bank.ds, aes(x = y, fill = marital)) + geom_density() + labs(title = " Distribution by Marital Status")
#
## All Default values are NO (the down-sampled data retained only "no";
## this makes 'default' useless as a predictor and it is dropped later)
summary(bank.ds$default)
## no
## 7718
## Aggregates: "no"/"yes" counts for each level of selected categoricals
aggregate(y~marital,data=bank.ds,summary)
## marital y.no y.yes
## 1 divorced 453 410
## 2 married 2254 2056
## 3 single 1152 1393
aggregate(y~month,data=bank.ds,summary)
## month y.no y.yes
## 1 apr 249 468
## 2 aug 607 533
## 3 dec 6 74
## 4 jul 613 512
## 5 jun 455 452
## 6 mar 24 246
## 7 may 1329 700
## 8 nov 471 365
## 9 oct 57 287
## 10 sep 48 222
aggregate(y~default,data=bank.ds,summary)
## default y.no y.yes
## 1 no 3859 3859
aggregate(y~education,data=bank.ds,summary)
## education y.no y.yes
## 1 basic.4y 292 326
## 2 basic.6y 200 136
## 3 basic.9y 548 380
## 4 high.school 984 934
## 5 illiterate 1 3
## 6 professional.course 551 538
## 7 university.degree 1283 1542
aggregate(y~poutcome,data=bank.ds,summary)
## poutcome y.no y.yes
## 1 failure 458 508
## 2 nonexistent 3334 2572
## 3 success 67 779
## GGpairs scatter-plot matrices coloured by the response.
# NOTE(review): the original called ggpairs(bank1) and ggpairs(bank2), but no
# objects named bank1/bank2 are defined anywhere in this script, so those
# calls error. The defined factor/continuous splits are used instead —
# confirm these match what bank1/bank2 were meant to be. Colour is mapped to
# "Class" (the downSample response present in these frames).
ggpairs(bank.Fact1,aes(colour=Class))
ggpairs(bank.Cont1,aes(colour=Class))
# bank_cat1 has no response column, so colour is taken from bank.ds directly
# (same rows, same order)
ggpairs(bank_cat1,aes(colour=bank.ds$Class))
####### PCA ##########
#
# pca1<-prcomp(bank2,scale.=TRUE)
# plot(prcomp(bank2))
# pc.scores<-pca1$x
# pairs(pc.scores)
# cor(pc.scores)
#
#
# ######
#
#
# prop.table(table(y,month),2)
# plot(y~month,col=c("red","blue"))
### Correlation plots for the factor and continuous variable groups.
### model.matrix(~0+., df) one-hot encodes the factor columns so a Pearson
### correlation matrix can be computed across mixed variable types.
plot_cor <- function(df, lab_sz, axis_sz) {
  model.matrix(~0+., data=df) %>%
    cor() %>%
    ggcorrplot(show.diag = F, type="upper", lab=TRUE, lab_size=lab_sz, insig = "blank")+
    theme(axis.text.x = element_text(angle = 60, hjust = 1,size = axis_sz), axis.text.y = element_text(hjust = 1,size = axis_sz))
}
plot_cor(bank.Fact1, 2.5, 8)
plot_cor(bank.Fact2, 2.5, 8)
plot_cor(bank.Cont1, 3, 10)
plot_cor(bank.Cont2, 3, 10)
#Setup Data — reload the raw file (read.csv keeps "." as the decimal mark)
bank_additional = read.csv(data_file_location, sep=";")
#Cleanup: recode the "unknown" sentinel as NA, then drop incomplete rows
bank_additional[bank_additional=="unknown"] <- NA
bank_additional.nona = na.omit(bank_additional)
#Convert every categorical column (plus the response) to a factor in one pass
factor_columns <- c("contact", "poutcome", "day_of_week", "month", "loan",
                    "housing", "default", "education", "marital", "job", "y")
bank_additional.nona[factor_columns] <- lapply(bank_additional.nona[factor_columns], as.factor)
#Down Sample so the "no"/"yes" classes are balanced
bankadditional_ds = downSample(bank_additional.nona, y=bank_additional.nona$y)
nrow(bankadditional_ds)
## [1] 7718
#Remove the single-level Default column from the dataframe (the response is
#kept under the name "Class", per the colnames output below)
bankaddition_ds_nodef = within(bankadditional_ds, rm(default))
#Setup test and train datasets: a seeded 75/25 random split (not stratified)
set.seed(123)
n_obs <- nrow(bankaddition_ds_nodef)
train_bankdata <- sample(seq_len(n_obs), size = floor(0.75 * n_obs))
train_bankdata_ds = bankaddition_ds_nodef[train_bankdata,]
test_bankdata_ds = bankaddition_ds_nodef[-train_bankdata,]
str(train_bankdata_ds)
## 'data.frame': 5788 obs. of 20 variables:
## $ age : int 70 33 35 56 37 32 36 32 29 46 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 4 8 2 4 1 10 2 8 10 1 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 1 2 3 2 2 3 2 3 3 2 ...
## $ education : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 4 3 1 7 4 1 4 6 4 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 1 2 1 1 2 1 2 1 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 2 1 2 1 2 2 2 2 1 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 10 7 7 5 3 7 5 7 4 10 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 4 1 5 5 2 5 5 2 3 5 ...
## $ duration : int 161 437 143 204 206 202 357 158 575 408 ...
## $ campaign : int 1 2 1 1 2 1 4 3 4 1 ...
## $ pdays : int 999 999 999 999 6 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 1 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 3 2 2 2 2 2 ...
## $ emp.var.rate : num -3.4 1.1 -1.8 1.4 -3 1.1 1.4 1.1 1.4 -3.4 ...
## $ cons.price.idx: num 92.4 94 92.9 94.5 92.7 ...
## $ cons.conf.idx : num -29.8 -36.4 -46.2 -41.8 -33 -36.4 -41.8 -36.4 -42.7 -29.8 ...
## $ euribor3m : num 0.75 4.859 1.334 4.864 0.706 ...
## $ nr.employed : num 5018 5191 5099 5228 5024 ...
## $ Class : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 2 ...
colnames(train_bankdata_ds)
## [1] "age" "job" "marital" "education"
## [5] "housing" "loan" "contact" "month"
## [9] "day_of_week" "duration" "campaign" "pdays"
## [13] "previous" "poutcome" "emp.var.rate" "cons.price.idx"
## [17] "cons.conf.idx" "euribor3m" "nr.employed" "Class"
# Design matrix for glmnet. Note 'duration' is excluded from the formula —
# presumably because call duration is only known after the outcome; confirm.
# model.matrix with Class~... also emits an "(Intercept)" column here, which
# glmnet keeps alongside its own intercept (harmless but redundant).
dat.train.x = model.matrix(Class~age+campaign+cons.conf.idx+cons.price.idx+contact+day_of_week+education+emp.var.rate+euribor3m+housing+job+loan+marital+month+nr.employed+pdays+poutcome+previous,train_bankdata_ds)
#dat.train.x = model.matrix(Class~.,train_bankdata_ds)
dat.train.y<- train_bankdata_ds[,20]
# Cross-validated LASSO (binomial deviance model, misclassification-rate CV)
cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000)
# Refit at the CV-optimal lambda
finalmodel<-glmnet(dat.train.x, dat.train.y, family = "binomial",lambda=cvfit$lambda.min)
dat.test.x = model.matrix(Class~age+campaign+cons.conf.idx+cons.price.idx+contact+day_of_week+education+emp.var.rate+euribor3m+housing+job+loan+marital+month+nr.employed+pdays+poutcome+previous,test_bankdata_ds)
#dat.test.x = model.matrix(Class~.,test_bankdata_ds)
# Predicted probabilities of the "yes" class on the test set
fit.pred.lasso <- predict(finalmodel, newx = dat.test.x, type = "response")
# Full logistic regression (same predictor set) as the stepAIC starting point
full.log<-glm(Class~age+campaign+cons.conf.idx+cons.price.idx+contact+day_of_week+education+emp.var.rate+euribor3m+housing+job+loan+marital+month+nr.employed+pdays+poutcome+previous,family="binomial",data=train_bankdata_ds)
#full.log<-glm(Class~.,family="binomial",data=train_bankdata_ds)
# NOTE(review): forward selection starting FROM the full model with the
# default scope cannot add anything, so forward.step.log is just full.log.
forward.step.log<-full.log %>% stepAIC(direction="forward", trace=FALSE)
# NOTE(review): "back" works only via partial argument matching to
# "backward" — spell it out to be safe.
backward.step.log<-full.log %>% stepAIC(direction="back", trace=FALSE)
both.step.log<-full.log %>% stepAIC(direction="both", trace=FALSE)
# Odds ratios with 95% Wald confidence intervals
exp(cbind("Odds ratio" = coef(forward.step.log), confint.default(forward.step.log, level = 0.95)))
## Odds ratio 2.5 % 97.5 %
## (Intercept) 1.870418e-94 8.647698e-146 4.045543e-43
## age 1.000128e+00 9.924983e-01 1.007816e+00
## campaign 9.694594e-01 9.420718e-01 9.976432e-01
## cons.conf.idx 1.009591e+00 9.811927e-01 1.038812e+00
## cons.price.idx 7.549592e+00 3.430317e+00 1.661547e+01
## contacttelephone 4.977718e-01 3.918306e-01 6.323569e-01
## day_of_weekmon 8.329301e-01 6.807647e-01 1.019108e+00
## day_of_weekthu 9.971352e-01 8.182358e-01 1.215149e+00
## day_of_weektue 9.650852e-01 7.901139e-01 1.178804e+00
## day_of_weekwed 1.267813e+00 1.037211e+00 1.549684e+00
## educationbasic.6y 1.318427e+00 9.007611e-01 1.929758e+00
## educationbasic.9y 1.101583e+00 8.130921e-01 1.492434e+00
## educationhigh.school 1.163220e+00 8.622221e-01 1.569295e+00
## educationilliterate 5.240160e+00 4.536076e-01 6.053531e+01
## educationprofessional.course 1.149827e+00 8.273811e-01 1.597937e+00
## educationuniversity.degree 1.371970e+00 1.012983e+00 1.858177e+00
## emp.var.rate 2.138751e-01 1.363260e-01 3.355382e-01
## euribor3m 1.351091e+00 8.837960e-01 2.065463e+00
## housingyes 9.606988e-01 8.481378e-01 1.088198e+00
## jobblue-collar 9.891484e-01 7.786658e-01 1.256527e+00
## jobentrepreneur 1.075043e+00 7.468508e-01 1.547453e+00
## jobhousemaid 1.011880e+00 6.466381e-01 1.583422e+00
## jobmanagement 9.224340e-01 7.149562e-01 1.190121e+00
## jobretired 1.472305e+00 1.024983e+00 2.114848e+00
## jobself-employed 1.081434e+00 7.642356e-01 1.530287e+00
## jobservices 9.859002e-01 7.676488e-01 1.266203e+00
## jobstudent 1.893147e+00 1.212400e+00 2.956125e+00
## jobtechnician 1.251546e+00 1.011042e+00 1.549261e+00
## jobunemployed 1.280300e+00 8.494467e-01 1.929688e+00
## loanyes 9.304898e-01 7.845191e-01 1.103620e+00
## maritalmarried 1.103767e+00 9.041222e-01 1.347496e+00
## maritalsingle 1.133466e+00 9.048773e-01 1.419800e+00
## monthaug 2.098534e+00 1.354230e+00 3.251920e+00
## monthdec 1.504521e+00 6.777552e-01 3.339825e+00
## monthjul 1.281441e+00 9.433188e-01 1.740760e+00
## monthjun 5.776635e-01 3.831440e-01 8.709391e-01
## monthmar 4.883722e+00 2.808021e+00 8.493790e+00
## monthmay 7.225746e-01 5.579613e-01 9.357531e-01
## monthnov 6.934656e-01 4.806058e-01 1.000601e+00
## monthoct 1.172986e+00 7.110574e-01 1.935001e+00
## monthsep 1.693977e+00 9.190046e-01 3.122462e+00
## nr.employed 1.005117e+00 9.954929e-01 1.014834e+00
## pdays 9.987639e-01 9.977881e-01 9.997407e-01
## poutcomenonexistent 1.516538e+00 1.043937e+00 2.203090e+00
## poutcomesuccess 1.671215e+00 6.285813e-01 4.443274e+00
## previous 9.525131e-01 7.186014e-01 1.262565e+00
# Test-set predicted probabilities for each stepwise model
fit.pred.step.forward<-predict(forward.step.log,newdata=test_bankdata_ds,type="response")
fit.pred.step.backward<-predict(backward.step.log,newdata=test_bankdata_ds,type="response")
fit.pred.step.both<-predict(both.step.log,newdata=test_bankdata_ds,type="response")
# Find the best cutoff for each model by scanning probability thresholds 0.1-0.9
# Accumulators: element k holds the metric at cutoff = k/10 for each model
accuracy_step.forward = c()
accuracy_step.backward = c()
accuracy_step.both = c()
accuracy_lasso = c()
sensitivity_step.forward = c()
sensitivity_step.backward = c()
sensitivity_step.both = c()
sensitivity_lasso = c()
specificity_step.forward = c()
specificity_step.backward = c()
specificity_step.both = c()
specificity_lasso = c()
# Sweep cutoffs 0.1, 0.2, ..., 0.9 and record accuracy/sensitivity/
# specificity for each of the four models. NOTE: the class.* variables keep
# their LAST-iteration values (cutoff = 0.9) after the loop, and
# class.step.forward is reused by the chi-squared test further below.
for (counter in 1:9)
{
cutoff=0.1 * counter
# Forward-stepwise model at this cutoff
class.step.forward<-factor(ifelse(fit.pred.step.forward<cutoff,"no","yes"),levels=c("no","yes"))
conf.step.forward<-table(class.step.forward,test_bankdata_ds$Class)
cmval = confusionMatrix(conf.step.forward)
accuracy_step.forward[counter] = cmval$overall[1]
sensitivity_step.forward[counter] = cmval$byClass[1]
specificity_step.forward[counter] = cmval$byClass[2]
# Backward-stepwise model at this cutoff
class.step.backward<-factor(ifelse(fit.pred.step.backward<cutoff,"no","yes"),levels=c("no","yes"))
conf.step.backward<-table(class.step.backward,test_bankdata_ds$Class)
cmval = confusionMatrix(conf.step.backward)
accuracy_step.backward[counter] = cmval$overall[1]
sensitivity_step.backward[counter] = cmval$byClass[1]
specificity_step.backward[counter] = cmval$byClass[2]
# Both-directions stepwise model at this cutoff
class.step.both<-factor(ifelse(fit.pred.step.both<cutoff,"no","yes"),levels=c("no","yes"))
conf.step.both<-table(class.step.both,test_bankdata_ds$Class)
cmval = confusionMatrix(conf.step.both)
accuracy_step.both[counter] = cmval$overall[1]
sensitivity_step.both[counter] = cmval$byClass[1]
specificity_step.both[counter] = cmval$byClass[2]
#Lasso accuracy
class.lasso<-factor(ifelse(fit.pred.lasso<cutoff,"no","yes"),levels=c("no","yes"))
conf.lasso<-table(class.lasso,test_bankdata_ds$Class)
cmval = confusionMatrix(conf.lasso)
accuracy_lasso[counter] = cmval$overall[1]
sensitivity_lasso[counter] = cmval$byClass[1]
specificity_lasso[counter] = cmval$byClass[2]
}
# One metric per figure, all four models overlaid (x axis is cutoff * 10)
plot(accuracy_step.forward, type="l", col="red",ylim= c(0,1), xlab = "Cutoff/10", ylab="Accuracy")
lines(accuracy_step.backward, col="orange", type="b", lwd=1.5)
lines(accuracy_step.both, col="pink", type="b", lwd=1.5)
lines(accuracy_lasso, col="blue", type="b", lwd=1.5)
plot(sensitivity_step.forward, type="l", col="red",ylim= c(0,1), xlab = "Cutoff/10", ylab="Sensitivity")
lines(sensitivity_step.backward, col="orange", type="b", lwd=1.5)
lines(sensitivity_step.both, col="pink", type="b", lwd=1.5)
lines(sensitivity_lasso, col="blue", type="b", lwd=1.5)
plot(specificity_step.forward, type="l", col="red",ylim= c(0,1), xlab = "Cutoff/10", ylab="Specificity")
lines(specificity_step.backward, col="orange", type="b", lwd=1.5)
lines(specificity_step.both, col="pink", type="b", lwd=1.5)
lines(specificity_lasso, col="blue", type="b", lwd=1.5)
# NOTE(review): these par() calls change global graphics state and are never
# restored; later base plots inherit pch/col/mfrow from here
par(pch=22, col="blue") # plotting symbol and color
par(mfrow=c(2,4)) # all plots on one page
# One model per panel, all three metrics overlaid
plot(accuracy_step.forward, type="l", col="red",ylim= c(0,1), xlab = "Cutoff/10", main="Forward")
lines(sensitivity_step.forward, col="orange", type="b", lwd=1.5)
lines(specificity_step.forward, col="pink", type="b", lwd=1.5)
plot(accuracy_step.backward, type="l", col="red",ylim= c(0,1), xlab = "Cutoff/10", main="Backward")
lines(sensitivity_step.backward, col="orange", type="b", lwd=1.5)
lines(specificity_step.backward, col="pink", type="b", lwd=1.5)
plot(accuracy_step.both, type="l", col="red",ylim= c(0,1), xlab = "Cutoff/10", main="Both")
lines(sensitivity_step.both, col="orange", type="b", lwd=1.5)
lines(specificity_step.both, col="pink", type="b", lwd=1.5)
plot(accuracy_lasso, type="l", col="red",ylim= c(0,1), xlab = "Cutoff/10", main="Lasso")
lines(sensitivity_lasso, col="orange", type="b", lwd=1.5)
lines(specificity_lasso, col="pink", type="b", lwd=1.5)
# Variable importance of the (forward) stepwise logistic model
imp = varImp(forward.step.log, scale=FALSE)
#Plot importance as a lollipop chart, sorted by importance
ggplot2::ggplot(imp, aes(x=reorder(rownames(imp),Overall), y=Overall)) +
geom_point( color="blue", size=4, alpha=0.6)+
geom_segment( aes(x=rownames(imp), xend=rownames(imp), y=0, yend=Overall),
color='skyblue') +
xlab('Variable')+
ylab('Overall Importance')+
theme_light() +
coord_flip()
## Goodness of Fit Test: the chi-squared test compares the predicted class
## distribution against the observed test-set classes; a significant p-value
## indicates the predictions are associated with the true classes.
# NOTE(review): class.step.forward holds the classification from the LAST
# cutoff evaluated in the loop above (0.9), not the best cutoff.
# NOTE(review): the original passed p = c(.5, .5); for a two-dimensional
# table chisq.test() performs a test of independence and silently ignores
# 'p' (it applies only to one-dimensional goodness-of-fit), so the argument
# is dropped here.
chisq <- chisq.test(table(class.step.forward,test_bankdata_ds$Class))
# The p-value is 2.8906339 x 10^-44, which is far below alpha — significant. The logistic regression model has captured the relationship between the categorical predictors and the response well.
# KNN analysis of the data
# Three reduced predictor sets for KNN (duration/pdays/default etc. removed)
bank.ds.red1 <- subset(bank.ds, select = -c(duration, pdays, default,day_of_week,cons.conf.idx) )
bank.ds.red2 <- subset(bank.ds, select = -c(age,job, marital, education, duration, pdays, default,day_of_week,cons.conf.idx, nr.employed) )
bank.ds.red3 <- subset(bank.ds, select = -c(age,job, marital, loan, housing, previous, education, duration, pdays, default,day_of_week,cons.conf.idx, nr.employed ) )
KNN.Variables <- bank.ds.red1
# knn() needs numeric inputs, so factor predictors are recoded as their
# integer level codes. NOTE(review): this imposes an arbitrary ordering on
# nominal factors such as job and month — confirm this is intended.
factor_cols <- c("job", "marital", "education", "housing", "loan",
                 "contact", "month", "poutcome")
KNN.Variables[factor_cols] <- lapply(KNN.Variables[factor_cols], as.integer)
# Centre and scale every predictor; the response y is carried along unscaled.
# Column order matches the [, 1:15] predictor indexing used below.
predictor_cols <- c("age", "job", "marital", "education", "housing", "loan",
                    "contact", "month", "campaign", "previous", "poutcome",
                    "emp.var.rate", "cons.price.idx", "euribor3m", "nr.employed")
KNN.DF.N <- data.frame(lapply(KNN.Variables[predictor_cols], scale),
                       y = KNN.Variables$y)
# Run 10 iterations over different train/test splits, trying k = 1..50, and
# average accuracy, sensitivity and specificity per k across iterations.
iterations = 10
numks = 50
masterAcc = matrix(nrow = iterations,ncol = numks)
masterSensitivity = matrix(nrow = iterations,ncol = numks)
masterSpecificity = matrix(nrow = iterations,ncol = numks)
splitPerc = .85 #Training / Test split Percentage
# NOTE(review): the original called set.seed(12345) INSIDE the j loop, so all
# ten "different" splits were identical; seeding once up front produces the
# ten distinct splits the comment above describes.
set.seed(12345)
for(j in 1:iterations)
{
trainIndices = sample(1:dim(KNN.DF.N)[1],round(splitPerc * dim(KNN.DF.N)[1]))
train.KNN.DF = KNN.DF.N[trainIndices,]
test.KNN.DF = KNN.DF.N[-trainIndices,]
for(i in 1:numks)
{
# Columns 1:15 are the scaled predictors; the response is the last column
classifications = knn(train.KNN.DF[,c(1:15)], test.KNN.DF[,c(1:15)],train.KNN.DF$y,prob=TRUE, k=i)
CM = confusionMatrix(table(classifications,test.KNN.DF$y))
masterAcc[j,i] = CM$overall[1]
masterSensitivity[j,i] = CM$byClass[1]
masterSpecificity[j,i] = CM$byClass[2]
}
# NOTE(review): the original also executed masterAcc[j] = CM$overall[1] here
# (likewise for sensitivity/specificity); with linear matrix indexing that
# silently overwrote masterAcc[j, 1] — the k = 1 results — with the k = 50
# results. Those lines are removed.
}
# Per-k means across the iterations (moved out of the loop; recomputing them
# every iteration was redundant)
MeanAcc = colMeans(masterAcc)
MeanSensitivity = colMeans(masterSensitivity)
MeanSpecificity = colMeans(masterSpecificity)
# Mean accuracy across iterations as a function of k
plot(seq(1,numks,1),MeanAcc, type = "l", main = "Plot of value of K vs. Accuracy", xlab = "Value of K", ylab="Accuracy %")
# Recorded run: best accuracy at K = 42 (74.35%) per the print output below.
# (The original comment claimed K = 45 / 73.76% — stale; and the fixed model
# below uses K = 41, which does not match either — confirm intended K.)
# NOTE: these assignments overwrite the per-k vectors with grand means
MeanAcc = mean(colMeans(masterAcc))
MeanSpecificity = mean(colMeans(masterSpecificity))
MeanSensitivity = mean(colMeans(masterSensitivity))
print(paste("Best Accuracy at K = ", which.max(colMeans(masterAcc)), "of", max(colMeans(masterAcc))*100,"%"))
## [1] "Best Accuracy at K = 42 of 74.3523316062176 %"
print(paste("Mean Accuracy = ", MeanAcc*100,"%"))
## [1] "Mean Accuracy = 72.3851468048359 %"
print(paste("Mean Sensitivity = ", MeanSensitivity*100,"%"))
## [1] "Mean Sensitivity = 77.1205673758865 %"
print(paste("Mean Specificity = ", MeanSpecificity*100,"%"))
## [1] "Mean Specificity = 67.8888888888889 %"
# Run KNN Model for K=41 on the last train/test split from the loop above
classifications.K41 = knn(train.KNN.DF[,c(1:15)], test.KNN.DF[,c(1:15)],train.KNN.DF$y,prob=TRUE, k=41)
#table(classifications.K41,test.AT.DF3$Attrition)
CM.K41 = confusionMatrix(table(classifications.K41,test.KNN.DF$y))
print(paste("Accuracy for K-41 = ", CM.K41$overall[1]*100,"%"))
## [1] "Accuracy for K-41 = 74.006908462867 %"
print(paste("Sensitivity for K-41 = ", CM.K41$byClass[1]*100,"%"))
## [1] "Sensitivity for K-41 = 79.2553191489362 %"
print(paste("Specificity for K-41 = ", CM.K41$byClass[2]*100,"%"))
## [1] "Specificity for K-41 = 69.023569023569 %"
# KNN with ONLY CONTINUOUS VARIABLES of the marketing data ===
KNN.Variables <- bank.Cont
# Centre and scale the eight continuous predictors (duration and previous are
# excluded); y comes from bank.ds (same rows, same order). Column order
# matches the [, 1:8] predictor indexing used below.
cont_pred <- c("age", "campaign", "pdays", "emp.var.rate", "cons.price.idx",
               "cons.conf.idx", "euribor3m", "nr.employed")
KNN.DF.N <- data.frame(lapply(KNN.Variables[cont_pred], scale), y = bank.ds$y)
# Run 10 iterations over different train/test splits, trying k = 1..50, and
# average accuracy, sensitivity and specificity per k across iterations.
iterations = 10
numks = 50
masterAcc = matrix(nrow = iterations,ncol = numks)
masterSensitivity = matrix(nrow = iterations,ncol = numks)
masterSpecificity = matrix(nrow = iterations,ncol = numks)
splitPerc = .85 #Training / Test split Percentage
# NOTE(review): as in the previous KNN section, the original seeded the RNG
# inside the loop (identical splits) and clobbered masterAcc[j, 1] via
# linear indexing after the inner loop; both defects are fixed here.
set.seed(12345)
for(j in 1:iterations)
{
trainIndices = sample(1:dim(KNN.DF.N)[1],round(splitPerc * dim(KNN.DF.N)[1]))
train.KNN.DF = KNN.DF.N[trainIndices,]
test.KNN.DF = KNN.DF.N[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(train.KNN.DF[,c(1:8)], test.KNN.DF[,c(1:8)],train.KNN.DF$y,prob=TRUE, k=i)
CM = confusionMatrix(table(classifications,test.KNN.DF$y))
masterAcc[j,i] = CM$overall[1]
masterSensitivity[j,i] = CM$byClass[1]
masterSpecificity[j,i] = CM$byClass[2]
}
}
# Per-k means across the iterations (computed once, after the loop)
MeanAcc = colMeans(masterAcc)
MeanSensitivity = colMeans(masterSensitivity)
MeanSpecificity = colMeans(masterSpecificity)
# Mean accuracy across iterations as a function of k
plot(seq(1,numks,1),MeanAcc, type = "l", main = "Plot of value of K vs. Accuracy", xlab = "Value of K", ylab="Accuracy %")
#which.max(MeanAcc)
#max(MeanSensitivity)
#max(MeanSpecificity)
# Recorded run: best accuracy at K = 45 (74.70%) per the print output below.
# (The original comment claimed K = 20 / 73.83% — stale; K = 20 is still the
# model fitted below — confirm intended K.)
# NOTE: these assignments overwrite the per-k vectors with grand means
MeanAcc = mean(colMeans(masterAcc))
MeanSpecificity = mean(colMeans(masterSpecificity))
MeanSensitivity = mean(colMeans(masterSensitivity))
print(paste("Best Accuracy at K = ", which.max(colMeans(masterAcc)), "of", max(colMeans(masterAcc))*100,"%"))
## [1] "Best Accuracy at K = 45 of 74.6977547495682 %"
print(paste("Mean Accuracy = ", MeanAcc*100,"%"))
## [1] "Mean Accuracy = 73.5164075993091 %"
print(paste("Mean Sensitivity = ", MeanSensitivity*100,"%"))
## [1] "Mean Sensitivity = 77.4893617021277 %"
print(paste("Mean Specificity = ", MeanSpecificity*100,"%"))
## [1] "Mean Specificity = 69.7441077441077 %"
# Run KNN Model for K=20
classifications.K20 = knn(train.KNN.DF[,c(1:8)], test.KNN.DF[,c(1:8)],train.KNN.DF$y,prob=TRUE, k=20)
CM.K20 = confusionMatrix(table(classifications.K20,test.KNN.DF$y))
# NOTE(review): the original printed CM.K41 (the K = 41 model from the
# PREVIOUS KNN section) under "K-20" labels — a copy/paste bug. CM.K20 is
# reported here instead; the recorded output below still shows the K = 41
# numbers from the buggy run.
print(paste("Accuracy for K-20 = ", CM.K20$overall[1]*100,"%"))
## [1] "Accuracy for K-20 = 74.006908462867 %"
print(paste("Sensitivity for K-20 = ", CM.K20$byClass[1]*100,"%"))
## [1] "Sensitivity for K-20 = 79.2553191489362 %"
print(paste("Specificity for K-20 = ", CM.K20$byClass[2]*100,"%"))
## [1] "Specificity for K-20 = 69.023569023569 %"
# ---- Random forest data setup (uses the FULL, imbalanced data) ----
df = bank_additional
#df <- subset(df, select = c(y, age, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed, default, housing, loan, poutcome))
# Keep only the response and two predictors
df <- subset(df, select = c(y, age, euribor3m))
df[c("y")]<- lapply(df[c("y")], factor)
#df[c("default")]<- lapply(df[c("default")], factor)
#df[c("housing")]<- lapply(df[c("housing")], factor)
#df[c("loan")]<- lapply(df[c("loan")], factor)
#df[c("poutcome")]<- lapply(df[c("poutcome")], factor)
dim(df) #rows and columns
## [1] 41188 3
df_clean = df
set.seed(12345)
# NOTE(review): the original filtered df_clean on default/housing/loan, but
# those columns were removed by the subset() above. grepl() on a missing
# (NULL) column returns logical(0), and df_clean[!logical(0), ] silently
# produced a ZERO-row frame. Filtering is applied only to columns that
# actually exist (df_clean is unused below either way — df is normalized).
for (col in intersect(c("default", "housing", "loan"), names(df_clean))) {
  df_clean <- df_clean[!grepl("unknown", df_clean[[col]]), ]
}
#df = df_clean
#normalize predictors to [0, 1]; NOTE(review): normalize() presumably comes
#from an attached package (e.g. BBmisc) — confirm which one is loaded
df_norm = normalize(df)
summary(df_norm)
## y age euribor3m
## no :36548 Min. :0.0000 Min. :0.0000
## yes: 4640 1st Qu.:0.1852 1st Qu.:0.1610
## Median :0.2593 Median :0.9574
## Mean :0.2842 Mean :0.6772
## 3rd Qu.:0.3704 3rd Qu.:0.9810
## Max. :1.0000 Max. :1.0000
# 75/25 split stratified on y (sample.split — presumably from caTools;
# confirm the library() call elsewhere in the project)
sample = sample.split(df_norm$y, SplitRatio = .75)
train = subset(df_norm, sample == TRUE)
test = subset(df_norm, sample == FALSE)
dim(train)
## [1] 30891 3
dim(test)
## [1] 10297 3
# Random forest on the full, imbalanced data — note the high No Information
# Rate (0.9564) in the confusion matrix below as a consequence
rf <- randomForest(
y ~ .,
data=train
)
# Gini-based variable importance for the two predictors
importance(rf)
## MeanDecreaseGini
## age 640.9315
## euribor3m 1697.0997
# Predict on the test predictors (column 1 is the response, dropped here)
pred = predict(rf, newdata=test[-1])
# Confusion matrix: rows = observed class, columns = predicted class
cm = table(test[,1], pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## pred
## no yes
## no 8940 197
## yes 908 252
##
## Accuracy : 0.8927
## 95% CI : (0.8865, 0.8986)
## No Information Rate : 0.9564
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2672
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9078
## Specificity : 0.5612
## Pos Pred Value : 0.9784
## Neg Pred Value : 0.2172
## Prevalence : 0.9564
## Detection Rate : 0.8682
## Detection Prevalence : 0.8873
## Balanced Accuracy : 0.7345
##
## 'Positive' Class : no
##
# Predicted vs. observed classes (both factors — base R draws a spineplot
# style comparison here) with a reference diagonal
plot(pred , test$y,main="Bagged Model",xlab="Predicted",ylab="Test Set Y")
abline (0,1)
# LDA is not an ideal tool when the predictors are largely categorical, but the analysis is shown here for comparison. (NOTE: the lda() call that produced the recorded output below is not present in this chunk — only its output survives.)
## Call:
## lda(Class ~ ., data = train_bankdata_ds)
##
## Prior probabilities of groups:
## no yes
## 0.5024188 0.4975812
##
## Group means:
## age jobblue-collar jobentrepreneur jobhousemaid jobmanagement
## no 38.87586 0.1977304 0.03473177 0.02372765 0.0801238
## yes 40.38646 0.1177083 0.02569444 0.02361111 0.0750000
## jobretired jobself-employed jobservices jobstudent jobtechnician
## no 0.03370014 0.03473177 0.10281981 0.01203576 0.1609354
## yes 0.09444444 0.03437500 0.06666667 0.05277778 0.1649306
## jobunemployed maritalmarried maritalsingle educationbasic.6y
## no 0.02097662 0.5773728 0.2940165 0.04779917
## yes 0.03437500 0.5295139 0.3618056 0.03784722
## educationbasic.9y educationhigh.school educationilliterate
## no 0.14993122 0.2678817 0.000343879
## yes 0.09826389 0.2427083 0.001041667
## educationprofessional.course educationuniversity.degree housingyes
## no 0.1313618 0.3290922 0.5381706
## yes 0.1378472 0.3993056 0.5600694
## loanyes contacttelephone monthaug monthdec monthjul monthjun
## no 0.1619670 0.3559147 0.1537139 0.00343879 0.1657497 0.1234525
## yes 0.1510417 0.1444444 0.1399306 0.01909722 0.1295139 0.1218750
## monthmar monthmay monthnov monthoct monthsep day_of_weekmon
## no 0.007909216 0.3342503 0.12379642 0.01581843 0.009628611 0.2107978
## yes 0.059722222 0.1791667 0.09791667 0.07361111 0.061458333 0.1815972
## day_of_weekthu day_of_weektue day_of_weekwed duration campaign pdays
## no 0.2070151 0.2077029 0.1887895 223.6228 2.575997 980.8982
## yes 0.2263889 0.2093750 0.2055556 524.0330 2.023611 784.1878
## previous poutcomenonexistent poutcomesuccess emp.var.rate cons.price.idx
## no 0.1444292 0.8758597 0.01616231 0.09920908 93.54671
## yes 0.4975694 0.6684028 0.20069444 -1.39652778 93.31208
## cons.conf.idx euribor3m nr.employed
## no -40.66967 3.667028 5171.081
## yes -39.80458 1.961340 5088.038
##
## Coefficients of linear discriminants:
## LD1
## age -0.0011902324
## jobblue-collar -0.0888352316
## jobentrepreneur -0.0262217669
## jobhousemaid -0.0102036344
## jobmanagement -0.0596353556
## jobretired 0.2695045085
## jobself-employed -0.0454606362
## jobservices -0.0244081974
## jobstudent 0.3791369431
## jobtechnician 0.1329959384
## jobunemployed 0.2260065299
## maritalmarried 0.0943189562
## maritalsingle 0.1236337792
## educationbasic.6y 0.0335797703
## educationbasic.9y 0.0038749005
## educationhigh.school -0.0132030059
## educationilliterate 1.1228617033
## educationprofessional.course 0.0382195325
## educationuniversity.degree 0.1733471306
## housingyes 0.0077398325
## loanyes -0.1025505459
## contacttelephone -0.3367896764
## monthaug 0.5218654575
## monthdec -0.1225151835
## monthjul 0.0002390292
## monthjun -0.3975591964
## monthmar 1.2659485471
## monthmay -0.4362814431
## monthnov -0.4384783281
## monthoct 0.0104165824
## monthsep 0.2526261124
## day_of_weekmon -0.0666560394
## day_of_weekthu 0.0007497524
## day_of_weektue -0.0017921098
## day_of_weekwed 0.1047784335
## duration 0.0027459462
## campaign -0.0092834282
## pdays -0.0006739991
## previous 0.0002493134
## poutcomenonexistent 0.3087063111
## poutcomesuccess 0.2295405552
## emp.var.rate -1.3064584484
## cons.price.idx 1.2819712430
## cons.conf.idx 0.0011031900
## euribor3m 0.4826144524
## nr.employed 0.0001728640
## no yes
## 988 942
## Confusion Matrix and Statistics
##
## ldap
## no yes
## no 825 126
## yes 163 816
##
## Accuracy : 0.8503
## 95% CI : (0.8336, 0.8659)
## No Information Rate : 0.5119
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7006
##
## Mcnemar's Test P-Value : 0.0342
##
## Sensitivity : 0.8350
## Specificity : 0.8662
## Pos Pred Value : 0.8675
## Neg Pred Value : 0.8335
## Prevalence : 0.5119
## Detection Rate : 0.4275
## Detection Prevalence : 0.4927
## Balanced Accuracy : 0.8506
##
## 'Positive' Class : no
##
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(test_bankdata_ds$Class, ldap)
## X-squared = 945.97, df = 1, p-value < 2.2e-16